{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural Language Models\n",
    "Status of Notebook: Work in Progress\n",
    "\n",
    "Reference: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf\n",
    "\n",
    "Dynet Version: https://github.com/neubig/nn4nlp-code/blob/master/02-lm/nn-lm.py\n",
    "\n",
    "Old PyTorch version: https://github.com/neubig/nn4nlp-code/blob/master/02-lm-pytorch/nn-lm-batch.py\n",
    "\n",
    "Additions compared to `nn.lm.ipynb`:\n",
    "- Cleaned up model architecture code\n",
    "- Added Dropout\n",
    "- Using different initial learning rate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import random\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import math\n",
    "import time\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# uncomment to download the datasets into data/ptb/, the directory the data is read from below\n",
    "#!wget -P data/ptb https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/test.txt\n",
    "#!wget -P data/ptb https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/train.txt\n",
    "#!wget -P data/ptb https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/ptb/valid.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Process the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function to read in a corpus file: one sentence per line, tokens separated by single spaces\n",
    "def read_data(filename):\n",
    "    \"\"\"Read a space-tokenized text file into a list of token lists.\n",
    "\n",
    "    Args:\n",
    "        filename: path of the text file, one sentence per line.\n",
    "\n",
    "    Returns:\n",
    "        A list with one entry per line of the file; each entry is the list of\n",
    "        strings obtained by stripping the line and splitting it on single spaces.\n",
    "    \"\"\"\n",
    "    # explicit encoding so the result does not depend on the platform's default locale\n",
    "    with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
    "        return [line.strip().split(\" \") for line in f]\n",
    "\n",
    "# load the training and validation corpora as lists of token lists\n",
    "train_data, val_data = read_data('data/ptb/train.txt'), read_data('data/ptb/valid.txt')\n",
    "\n",
    "# vocabulary lookup tables, seeded with the two special tokens:\n",
    "# index 0 is the sentence boundary <s>, index 1 is the unknown word <unk>\n",
    "word_to_index = {\"<s>\": 0, \"<unk>\": 1}\n",
    "index_to_word = {index: word for word, index in word_to_index.items()}\n",
    "\n",
    "# extend the word <-> index dictionaries with the words found in data\n",
    "def create_dict(data, check_unk=False):\n",
    "    \"\"\"Add the words of `data` to the module-level vocabulary dictionaries.\n",
    "\n",
    "    Args:\n",
    "        data: iterable of sentences, each an iterable of word strings.\n",
    "        check_unk: when False (training data), every unseen word receives a fresh\n",
    "            index in both word_to_index and index_to_word. When True (held-out\n",
    "            data), every unseen word is only aliased to the index of \"<unk>\" in\n",
    "            word_to_index; index_to_word is left untouched, so each index keeps\n",
    "            decoding to the training word it was created for.\n",
    "    \"\"\"\n",
    "    for line in data:\n",
    "        for word in line:\n",
    "            if word in word_to_index:\n",
    "                continue\n",
    "            if check_unk:\n",
    "                # has no effect on PTB, whose text already contains <unk>;\n",
    "                # needed for data that was not pre-processed this way\n",
    "                word_to_index[word] = word_to_index[\"<unk>\"]\n",
    "            else:\n",
    "                # index_to_word holds exactly one entry per real index, so its length\n",
    "                # is always the next free index (aliases of <unk> never inflate it)\n",
    "                new_index = len(index_to_word)\n",
    "                word_to_index[word] = new_index\n",
    "                index_to_word[new_index] = word\n",
    "\n",
    "create_dict(train_data)\n",
    "create_dict(val_data, check_unk=True)\n",
    "\n",
    "# convert every sentence into the list of its word indices\n",
    "def create_tensor(data):\n",
    "    \"\"\"Yield, for every sentence in `data`, the list of its word indices.\"\"\"\n",
    "    for line in data:\n",
    "        yield [word_to_index[word] for word in line]\n",
    "\n",
    "train_data = list(create_tensor(train_data))\n",
    "val_data = list(create_tensor(val_data))\n",
    "\n",
    "# vocabulary size = number of distinct indices; words aliased to <unk> in\n",
    "# word_to_index share an existing index and must not enlarge the model\n",
    "number_of_words = len(index_to_word)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In our implementation we are using batched training: all the histories of a sentence are scored in a single forward pass. There are a few differences from the original implementation found [here](https://github.com/neubig/nn4nlp-code/blob/master/02-lm/nn-lm.py). "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# use the GPU when one is available, otherwise stay on the CPU\n",
    "if torch.cuda.is_available():\n",
    "    device = 'cuda'\n",
    "else:\n",
    "    device = 'cpu'\n",
    "\n",
    "N = 2 # number of previous words the model conditions on\n",
    "EMB_SIZE = HID_SIZE = 128 # size of the embedding and of the hidden layer\n",
    "\n",
    "# Feed-forward neural language model\n",
    "class FNN_LM(nn.Module):\n",
    "    \"\"\"Scores the next word from the embeddings of a fixed number of previous words.\n",
    "\n",
    "    Args:\n",
    "        number_of_words: vocabulary size (rows of the embedding, width of the output).\n",
    "        ngram_length: number of history words fed to the network.\n",
    "        EMB_SIZE: dimensionality of each word embedding.\n",
    "        HID_SIZE: width of the hidden layer.\n",
    "        dropout: dropout probability applied after the hidden activation.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, number_of_words, ngram_length, EMB_SIZE, HID_SIZE, dropout):\n",
    "        super().__init__()\n",
    "\n",
    "        # one EMB_SIZE-dimensional vector per vocabulary entry\n",
    "        self.embedding = nn.Embedding(number_of_words, EMB_SIZE)\n",
    "\n",
    "        # concatenated history embeddings -> tanh hidden layer -> dropout -> vocabulary scores\n",
    "        layers = [\n",
    "            nn.Linear(ngram_length * EMB_SIZE, HID_SIZE),\n",
    "            nn.Tanh(),\n",
    "            nn.Dropout(dropout),\n",
    "            nn.Linear(HID_SIZE, number_of_words),\n",
    "        ]\n",
    "        self.fnn = nn.Sequential(*layers)\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Map a batch of word-index histories to unnormalized next-word scores.\"\"\"\n",
    "        embs = self.embedding(x)                  # Size: [batch_size x num_hist x emb_size]\n",
    "        feat = embs.reshape(embs.shape[0], -1)    # Size: [batch_size x (num_hist*emb_size)]\n",
    "        return self.fnn(feat)                     # Size: [batch_size x num_words]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Settings and Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the model and place it on the selected device (a no-op when device is 'cpu')\n",
    "model = FNN_LM(number_of_words, N, EMB_SIZE, HID_SIZE, dropout=0.2).to(device)\n",
    "\n",
    "# Adam optimizer over all model parameters\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "# reduction=\"sum\" returns the total loss of a batch instead of its mean\n",
    "criterion = nn.CrossEntropyLoss(reduction=\"sum\")\n",
    "\n",
    "# function to calculate the sentence loss\n",
    "def calc_sent_loss(sent):\n",
    "    \"\"\"Return the loss of one sentence under the current model.\n",
    "\n",
    "    Args:\n",
    "        sent: list of word indices making up the sentence.\n",
    "\n",
    "    Returns:\n",
    "        The tensor produced by `criterion` for all positions of the sentence,\n",
    "        including the prediction of the closing \"<s>\" symbol.\n",
    "    \"\"\"\n",
    "    boundary = word_to_index[\"<s>\"]\n",
    "\n",
    "    # pair every target (each word, then the closing boundary symbol) with the N\n",
    "    # tokens preceding it; early positions see a history padded with boundary symbols\n",
    "    context = [boundary] * N\n",
    "    histories, targets = [], []\n",
    "    for token in sent + [boundary]:\n",
    "        histories.append(context.copy())\n",
    "        targets.append(token)\n",
    "        context = context[1:] + [token]\n",
    "\n",
    "    # a single forward pass scores every position of the sentence at once\n",
    "    history_batch = torch.tensor(histories, dtype=torch.long, device=device)\n",
    "    target_batch = torch.tensor(targets, dtype=torch.long, device=device)\n",
    "    return criterion(model(history_batch), target_batch)\n",
    "\n",
    "MAX_LEN = 100 # maximum number of words in a generated sentence\n",
    "# Function to generate a sentence\n",
    "def generate_sent():\n",
    "    \"\"\"Sample a sentence from the model, one word at a time.\n",
    "\n",
    "    Starting from a history of N \"<s>\" symbols, repeatedly samples the next word\n",
    "    from the model's softmax distribution until \"<s>\" is drawn or MAX_LEN words\n",
    "    have been collected. The model is used in its current train/eval mode; call\n",
    "    model.eval() beforehand to sample with dropout disabled.\n",
    "\n",
    "    Returns:\n",
    "        The list of sampled word indices (without the terminating \"<s>\").\n",
    "    \"\"\"\n",
    "    S = word_to_index[\"<s>\"]\n",
    "    hist = [S] * N\n",
    "    sent = []\n",
    "    # sampling needs no gradients, so skip building the autograd graph\n",
    "    with torch.no_grad():\n",
    "        while True:\n",
    "            logits = model(torch.LongTensor([hist]).to(device))\n",
    "            # normalize over the vocabulary axis; the implicit-dim form is deprecated\n",
    "            p = torch.nn.functional.softmax(logits, dim=-1) # 1 x number_of_words\n",
    "            next_word = p.multinomial(num_samples=1).item()\n",
    "            if next_word == S or len(sent) == MAX_LEN:\n",
    "                break\n",
    "            sent.append(next_word)\n",
    "            hist = hist[1:] + [next_word]\n",
    "    return sent"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--finished 5000 sentences (words/sec=12807.67)\n",
      "--finished 10000 sentences (words/sec=12788.71)\n",
      "--finished 15000 sentences (words/sec=12807.44)\n",
      "--finished 20000 sentences (words/sec=12801.59)\n",
      "--finished 25000 sentences (words/sec=12852.69)\n",
      "--finished 30000 sentences (words/sec=12843.39)\n",
      "--finished 35000 sentences (words/sec=12835.04)\n",
      "--finished 40000 sentences (words/sec=12816.01)\n",
      "iter 0: train loss/word=6.1274, ppl=458.2398, (words/sec=12801.17)\n",
      "iter 0: dev loss/word=5.8676, ppl=353.3835, (words/sec=1.44s)\n",
      "it will change at georgia & co. got instead of totally a appointment from the big bankers posted <unk> & co. also received that brokers\n",
      "one and claim the politicians amount for <unk> the measure of the california santa contract\n",
      "our birth capitol led the giant <unk> by an <unk> market in the central <unk> held the rise of the company 's sheet that the irs on britain dollars\n",
      "yesterday 's jail & <unk> investigations on news for buying creditors has lower market for polish statement so and now in a bill to government americans system to my march and programs of links stock-market program charlotte nasdaq lowest judge <unk> provide an final state university of foot an woman spokesman for something he was very constitution on the new post\n",
      "it N also buy-out of bank in may industry mr. phelan said <unk> in his current N N owned of closely bartlett below minister blocking which mr. repeat and unemployment to de our own news to submit resolution trust said a rivals on the reached which now end for most cautioned failed more than N remic mortgage officials and and goldman sachs & co. currently myself mrs. mandatory almost <unk> to the hoffman <unk> greater silver kidder peabody & co. does n't <unk> any grip from the buy-out change in drexel 's third-quarter profit of N N to speculation a\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/nlp/lib/python3.7/site-packages/ipykernel_launcher.py:38: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--finished 5000 sentences (words/sec=12587.62)\n",
      "--finished 10000 sentences (words/sec=12652.41)\n",
      "--finished 15000 sentences (words/sec=12740.18)\n",
      "--finished 20000 sentences (words/sec=12763.71)\n",
      "--finished 25000 sentences (words/sec=12753.94)\n",
      "--finished 30000 sentences (words/sec=12754.24)\n",
      "--finished 35000 sentences (words/sec=12762.18)\n",
      "--finished 40000 sentences (words/sec=12740.41)\n",
      "iter 1: train loss/word=5.7389, ppl=310.7324, (words/sec=12744.21)\n",
      "iter 1: dev loss/word=5.7766, ppl=322.6629, (words/sec=1.40s)\n",
      "the advertising for champion the dollar was named whose damage was down from lawyers and the new england told them need\n",
      "rumors with cents a share\n",
      "justice general operations in chicago\n",
      "british bought what of going to pay to rates since april\n",
      "according to an <unk> family\n",
      "--finished 5000 sentences (words/sec=12702.39)\n",
      "--finished 10000 sentences (words/sec=12731.82)\n",
      "--finished 15000 sentences (words/sec=12755.89)\n",
      "--finished 20000 sentences (words/sec=12828.83)\n",
      "--finished 25000 sentences (words/sec=12836.63)\n",
      "--finished 30000 sentences (words/sec=12801.01)\n",
      "--finished 35000 sentences (words/sec=12803.18)\n",
      "--finished 40000 sentences (words/sec=12779.24)\n",
      "iter 2: train loss/word=5.5996, ppl=270.3145, (words/sec=12792.34)\n",
      "iter 2: dev loss/word=5.7464, ppl=313.0468, (words/sec=1.40s)\n",
      "french his experience within george bush expected luxury world culture by planning\n",
      "complete as example this scheduled other sellers operations\n",
      "much of the retail\n",
      "just scheduled time to foreign exchange cigarette merchandise outlets in the market 's $ N a share a year earlier branch in the turmoil before our company\n",
      "the manufacturing products construction <unk> has about $ N million or $ N million from $ N million in bridge\n",
      "--finished 5000 sentences (words/sec=12953.86)\n",
      "--finished 10000 sentences (words/sec=12970.24)\n",
      "--finished 15000 sentences (words/sec=12896.14)\n",
      "--finished 20000 sentences (words/sec=12875.42)\n",
      "--finished 25000 sentences (words/sec=12833.31)\n",
      "--finished 30000 sentences (words/sec=12839.11)\n",
      "--finished 35000 sentences (words/sec=12822.49)\n",
      "--finished 40000 sentences (words/sec=12814.87)\n",
      "iter 3: train loss/word=5.5124, ppl=247.7381, (words/sec=12819.57)\n",
      "iter 3: dev loss/word=5.7235, ppl=305.9709, (words/sec=1.39s)\n",
      "ago\n",
      "british community interest reserves in a low $ N a tax ministry in japan and wants to say it shows the consumer price network\n",
      "expects operations for this first maryland staff studies in beijing\n",
      "u.s. government bills here have been implemented\n",
      "computer software inc. and expected that were out the <unk> moscow who illegally a <unk> <unk> stage sets for over $ N million a year earlier\n",
      "--finished 5000 sentences (words/sec=12860.84)\n",
      "--finished 10000 sentences (words/sec=12756.10)\n",
      "--finished 15000 sentences (words/sec=12795.20)\n",
      "--finished 20000 sentences (words/sec=12799.80)\n",
      "--finished 25000 sentences (words/sec=12830.27)\n",
      "--finished 30000 sentences (words/sec=12820.51)\n",
      "--finished 35000 sentences (words/sec=12821.23)\n",
      "--finished 40000 sentences (words/sec=12839.05)\n",
      "iter 4: train loss/word=5.4502, ppl=232.8159, (words/sec=12841.22)\n",
      "iter 4: dev loss/word=5.7149, ppl=303.3545, (words/sec=1.40s)\n",
      "but the pilots are not profitable if mr. <unk> said\n",
      "his trading know that cathay is through the research session mr. brooks and center usually raised its quarterly dividend\n",
      "and <unk> <unk> kept a <unk> turn on the moment he said\n",
      "but a church premium still the stockholders die\n",
      "if they benefited from the san couple has a china 's strategy while government prices\n"
     ]
    }
   ],
   "source": [
    "# start training: each iteration is one pass over the training data, followed by\n",
    "# an evaluation on the validation data and a few sampled sentences\n",
    "for ITER in range(5):\n",
    "    # training\n",
    "    random.shuffle(train_data)\n",
    "    model.train()\n",
    "    train_words, train_loss = 0, 0.0\n",
    "    start = time.time()\n",
    "    for sent_id, sent in enumerate(train_data):\n",
    "        my_loss = calc_sent_loss(sent)\n",
    "        train_loss += my_loss.item()\n",
    "        # NOTE: the loss also covers the end-of-sentence prediction while the word\n",
    "        # count does not, so loss/word is slightly above a per-prediction average\n",
    "        train_words += len(sent)\n",
    "        optimizer.zero_grad()\n",
    "        my_loss.backward()\n",
    "        optimizer.step()\n",
    "        if (sent_id+1) % 5000 == 0:\n",
    "            print(\"--finished %r sentences (words/sec=%.2f)\" % (sent_id+1, train_words/(time.time()-start)))\n",
    "    print(\"iter %r: train loss/word=%.4f, ppl=%.4f, (words/sec=%.2f)\" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), train_words/(time.time()-start)))\n",
    "\n",
    "    # evaluation: dropout off and no gradient tracking, since nothing is back-propagated\n",
    "    model.eval()\n",
    "    dev_words, dev_loss = 0, 0.0\n",
    "    start = time.time()\n",
    "    with torch.no_grad():\n",
    "        for sent in val_data:\n",
    "            dev_loss += calc_sent_loss(sent).item()\n",
    "            dev_words += len(sent)\n",
    "    # the last value is the elapsed evaluation time in seconds, so label it as such\n",
    "    print(\"iter %r: dev loss/word=%.4f, ppl=%.4f, (time=%.2fs)\" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))\n",
    "\n",
    "    # Generate a few sentences (the model is still in eval mode here)\n",
    "    with torch.no_grad():\n",
    "        for _ in range(5):\n",
    "            sent = generate_sent()\n",
    "            print(\" \".join([index_to_word[x] for x in sent]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "154abf72fb8cc0db1aa0e7366557ff891bff86d6d75b7e5f2e68a066d591bfd7"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
